# Source Generated with Decompyle++
# File: in.pyo (Python 2.5)

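"""A cleanup tool for HTML.

Removes unwanted tags and content.  See the ``Cleaner`` class for details.
"""
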
import re
import copy

try:
    from urlparse import urlsplit
except ImportError:
    # Python 3
    from urllib.parse import urlsplit

from lxml import etree
from lxml.html import defs
from lxml.html import fromstring, tostring, XHTML_NAMESPACE
from lxml.html import _nons, _transform_result

try:
    set
except NameError:
    from sets import Set as set

# Python 2/3 compatibility aliases: check the bare builtin name and fall
# back to the closest available type.
try:
    unichr
except NameError:
    # Python 3
    unichr = chr

try:
    unicode
except NameError:
    # Python 3
    unicode = str

try:
    bytes
except NameError:
    # Python < 2.6
    bytes = str

try:
    basestring
except NameError:
    basestring = (str, bytes)

__all__ = [
    'clean_html',
    'clean',
    'Cleaner',
    'autolink',
    'autolink_html',
    'word_break',
    'word_break_html',
]

# Patterns for scriptable constructs that must not survive in CSS or URLs.
_css_javascript_re = re.compile(r'expression\s*\(.*?\)', re.S | re.I)
_css_import_re = re.compile(r'@\s*import', re.I)
_javascript_scheme_re = re.compile(
    r'\s*(?:javascript|jscript|livescript|vbscript|about|mocha):', re.I)
_substitute_whitespace = re.compile(r'\s+').sub
# IE conditional comments, e.g. <!--[if IE]> ... <![endif]-->
_conditional_comment_re = re.compile(r'\[if[\s\n\r]+.*?][\s\n\r]*>', re.I | re.S)

_find_styled_elements = etree.XPath('descendant-or-self::*[@style]')
_find_external_links = etree.XPath(
    "descendant-or-self::a[normalize-space(@href) and "
    "substring(normalize-space(@href),1,1) != '#'] | "
    "descendant-or-self::x:a[normalize-space(@href) and "
    "substring(normalize-space(@href),1,1) != '#']",
    namespaces={'x': XHTML_NAMESPACE})


class Cleaner(object):
    """
    Instances of this class clean the documents they are given by removing
    or rewriting whatever the options below disallow.  Each keyword argument
    to the constructor overrides the class attribute of the same name.
    """

    scripts = True
    javascript = True
    comments = True
    style = False
    links = True
    meta = True
    page_structure = True
    processing_instructions = True
    embedded = True
    frames = True
    forms = True
    annoying_tags = True
    remove_tags = None
    allow_tags = None
    remove_unknown_tags = True
    safe_attrs_only = True
    add_nofollow = False
    host_whitelist = ()
    whitelist_tags = set(['iframe', 'embed'])

    def __init__(self, **kw):
        for name, value in kw.items():
            if not hasattr(self, name):
                raise TypeError(
                    'Unknown parameter: %s=%r' % (name, value))
            setattr(self, name, value)

    # For each tag that can embed or link external content, the attribute(s)
    # that hold the URL.  Used by allow_element() / allow_embedded_url().
    _tag_link_attrs = dict(
        script='src',
        link='href',
        applet=['code', 'object'],
        iframe='src',
        embed='src',
        layer='src',
        a='href',
    )

    def __call__(self, doc):
        """
        Clean the document (modifies it in place).
        """
        if hasattr(doc, 'getroot'):
            # ElementTree instance rather than an element
            doc = doc.getroot()
        # Convert XHTML to HTML by stripping the namespace from tag names.
        for el in doc.iter():
            tag = el.tag
            if isinstance(tag, basestring):
                el.tag = _nons(tag)
        # IE treats <image> like <img>; normalize it so later steps see <img>.
        for el in doc.iter('image'):
            el.tag = 'img'

        if not self.comments:
            # If comments are not being removed anyway, at least kill IE
            # conditional comments, which can smuggle arbitrary markup past
            # the parser.
            self.kill_conditional_comments(doc)

        kill_tags = set()
        remove_tags = set(self.remove_tags or ())
        if self.allow_tags:
            allow_tags = set(self.allow_tags)
        else:
            allow_tags = set()

        if self.scripts:
            kill_tags.add('script')

        if self.safe_attrs_only:
            safe_attrs = set(defs.safe_attrs)
            for el in doc.iter():
                attrib = el.attrib
                for aname in attrib.keys():
                    if aname not in safe_attrs:
                        del attrib[aname]

        if self.javascript:
            if not self.safe_attrs_only:
                # safe_attrs_only already stripped the on* event handlers
                for el in doc.iter():
                    attrib = el.attrib
                    for aname in attrib.keys():
                        if aname.startswith('on'):
                            del attrib[aname]
            # Remove javascript: (and similar) URLs everywhere.
            doc.rewrite_links(self._remove_javascript_link,
                              resolve_base_href=False)
            if not self.style:
                # Style is being kept, so scrub scriptable CSS from style
                # attributes and <style> elements.
                for el in _find_styled_elements(doc):
                    old = el.get('style')
                    new = _css_javascript_re.sub('', old)
                    new = _css_import_re.sub('', new)
                    if self._has_sneaky_javascript(new):
                        # Something tricky is going on...
                        del el.attrib['style']
                    elif new != old:
                        el.set('style', new)
                for el in list(doc.iter('style')):
                    if el.get('type', '').lower().strip() == 'text/javascript':
                        el.drop_tree()
                        continue
                    old = el.text or ''
                    new = _css_javascript_re.sub('', old)
                    # Imported CSS can do anything; do not allow it.
                    new = _css_import_re.sub('', new)
                    if self._has_sneaky_javascript(new):
                        # Something tricky is going on...
                        el.text = '/* deleted */'
                    elif new != old:
                        el.text = new

        if self.comments or self.processing_instructions:
            kill_tags.add(etree.Comment)
        if self.processing_instructions:
            kill_tags.add(etree.ProcessingInstruction)

        if self.style:
            kill_tags.add('style')
            for el in _find_styled_elements(doc):
                del el.attrib['style']

        if self.links:
            kill_tags.add('link')
        elif self.style or self.javascript:
            # If Javascript is not allowed, external stylesheets have to go
            # too, since scripts can hide in them.  Note that this also drops
            # alternate stylesheets.
            for el in list(doc.iter('link')):
                if 'stylesheet' in el.get('rel', '').lower():
                    el.drop_tree()

        if self.meta:
            kill_tags.add('meta')
        if self.page_structure:
            remove_tags.update(('head', 'html', 'title'))

        if self.embedded:
            # Drop any <param> that is not inside an <applet> or <object>.
            for el in list(doc.iter('param')):
                parent = el.getparent()
                while parent is not None and parent.tag not in ('applet', 'object'):
                    parent = parent.getparent()
                if parent is None:
                    el.drop_tree()
            kill_tags.update(('applet',))
            # The alternate content inside <iframe>/<object> is a useful
            # fallback, so those tags are removed rather than killed.
            remove_tags.update(('iframe', 'embed', 'layer', 'object', 'param'))

        if self.frames:
            kill_tags.update(defs.frame_tags)
        if self.forms:
            remove_tags.add('form')
            kill_tags.update(('button', 'input', 'select', 'textarea'))
        if self.annoying_tags:
            remove_tags.update(('blink', 'marquee'))

        _remove = []
        _kill = []
        for el in doc.iter():
            if el.tag in kill_tags:
                if self.allow_element(el):
                    continue
                _kill.append(el)
            elif el.tag in remove_tags:
                if self.allow_element(el):
                    continue
                _remove.append(el)

        if _remove and _remove[0] == doc:
            # The document root cannot be removed; turn it into a bare <div>.
            el = _remove.pop(0)
            el.tag = 'div'
            el.attrib.clear()
        elif _kill and _kill[0] == doc:
            # Neither can it be killed; empty it instead.
            el = _kill.pop(0)
            if el.tag != 'html':
                el.tag = 'div'
            el.clear()

        for el in _kill:
            el.drop_tree()
        for el in _remove:
            el.drop_tag()

        allow_tags = self.allow_tags
        if self.remove_unknown_tags:
            if allow_tags:
                raise ValueError(
                    'It does not make sense to pass in both '
                    'allow_tags and remove_unknown_tags')
            allow_tags = set(defs.tags)
        if allow_tags:
            bad = []
            for el in doc.iter():
                if el.tag not in allow_tags:
                    bad.append(el)
            for el in bad:
                el.drop_tag()

        if self.add_nofollow:
            for el in _find_external_links(doc):
                if not self.allow_follow(el):
                    el.set('rel', 'nofollow')

    def allow_follow(self, anchor):
        """
        Override to suppress rel="nofollow" on some anchors.
        """
        return False

    def allow_element(self, el):
        # Elements in kill_tags/remove_tags survive only if every URL
        # attribute they carry passes allow_embedded_url().
        if el.tag not in self._tag_link_attrs:
            return False
        attr = self._tag_link_attrs[el.tag]
        if isinstance(attr, (list, tuple)):
            for one_attr in attr:
                url = el.get(one_attr)
                if not url:
                    return False
                if not self.allow_embedded_url(el, url):
                    return False
            return True
        else:
            url = el.get(attr)
            if not url:
                return False
            return self.allow_embedded_url(el, url)

    def allow_embedded_url(self, el, url):
        if self.whitelist_tags is not None and el.tag not in self.whitelist_tags:
            return False
        scheme, netloc, path, query, fragment = urlsplit(url)
        netloc = netloc.lower().split(':', 1)[0]
        if scheme not in ('http', 'https'):
            return False
        if netloc in self.host_whitelist:
            return True
        return False
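
    # With the defaults above, embedded content is only kept when
    # allow_embedded_url() accepts it: the tag must be in ``whitelist_tags``
    # and the URL must be http(s) on a host listed in ``host_whitelist``.
    # For example (hypothetical host), Cleaner(host_whitelist=['www.youtube.com'])
    # keeps <iframe> embeds served from that host while other embedded
    # content is still removed.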

    def kill_conditional_comments(self, doc):
        """
        IE conditional comments can embed HTML that the parser does not
        normally see, so kill any comment that could be conditional.
        """
        self._kill_elements(
            doc, lambda el: _conditional_comment_re.search(el.text),
            etree.Comment)

    def _kill_elements(self, doc, condition, iterate=None):
        bad = []
        for el in doc.iter(iterate):
            if condition(el):
                bad.append(el)
        for el in bad:
            el.drop_tree()

    def _remove_javascript_link(self, link):
        # Links like "j a v a script:" may still be interpreted by browsers,
        # so collapse whitespace before testing the scheme.
        new = _substitute_whitespace('', link)
        if _javascript_scheme_re.search(new):
            # A scriptable URL scheme; drop the link entirely.
            return ''
        return link

    _substitute_comments = re.compile(r'/\*.*?\*/', re.S).sub

    def _has_sneaky_javascript(self, style):
        # CSS comments, backslashes and whitespace can be used to disguise
        # "javascript:" or "expression(" inside a style value, so strip them
        # before checking.
        style = self._substitute_comments('', style)
        style = style.replace('\\', '')
        style = _substitute_whitespace('', style)
        style = style.lower()
        if 'javascript:' in style:
            return True
        if 'expression(' in style:
            return True
        return False

    def clean_html(self, html):
        result_type = type(html)
        if isinstance(html, basestring):
            doc = fromstring(html)
        else:
            doc = copy.deepcopy(html)
        self(doc)
        return _transform_result(result_type, doc)


clean = Cleaner()
clean_html = clean.clean_html
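
# Usage sketch (illustrative; the markup below is made up).  The module-level
# ``clean_html`` uses the default options of the ``clean`` instance above;
# instantiate ``Cleaner`` directly for per-option control.
#
#     dirty = '<p onclick="evil()">hi</p><script>evil()</script>'
#     print(clean_html(dirty))
#     cleaner = Cleaner(style=True, links=False, add_nofollow=True)
#     print(cleaner.clean_html(dirty))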

_link_regexes = [
    re.compile(r'(?P<body>https?://(?P<host>[a-z0-9._-]+)'
               r'(?:/[/\-_.,a-z0-9%&?;=~]*)?)', re.I),
    re.compile(r'mailto:(?P<body>[a-z0-9._-]+@(?P<host>[a-z0-9_.]+[a-z]))', re.I),
]

# Elements whose text content should never be autolinked.
_avoid_elements = ['textarea', 'pre', 'code', 'head', 'select', 'a']

# Hosts that should not be turned into links.
_avoid_hosts = [
    re.compile(r'^localhost', re.I),
    re.compile(r'\bexample\.(?:com|org|net)$', re.I),
    re.compile(r'^127\.0\.0\.1$'),
]

_avoid_classes = ['nolink']


def autolink(el, link_regexes=_link_regexes,
             avoid_elements=_avoid_elements,
             avoid_hosts=_avoid_hosts,
             avoid_classes=_avoid_classes):
    """
    Turn URLs found in the text of this element (and its descendants) into
    <a> links, skipping avoid_elements, elements with a class listed in
    avoid_classes, and hosts matching avoid_hosts.
    """
    if el.tag in avoid_elements:
        return
    class_name = el.get('class')
    if class_name:
        class_name = class_name.split()
        for match_class in avoid_classes:
            if match_class in class_name:
                return
    for child in list(el):
        autolink(child, link_regexes=link_regexes,
                 avoid_elements=avoid_elements,
                 avoid_hosts=avoid_hosts,
                 avoid_classes=avoid_classes)
        if child.tail:
            text, tail_children = _link_text(
                child.tail, link_regexes, avoid_hosts, factory=el.makeelement)
            if tail_children:
                # The tail text produced new <a> elements; insert them right
                # after this child.
                child.tail = text
                index = el.index(child)
                el[index + 1:index + 1] = tail_children
    if el.text:
        text, pre_children = _link_text(
            el.text, link_regexes, avoid_hosts, factory=el.makeelement)
        if pre_children:
            el.text = text
            el[:0] = pre_children


def _link_text(text, link_regexes, avoid_hosts, factory):
    leading_text = ''
    links = []
    last_pos = 0
    while True:
        best_match, best_pos = None, None
        for regex in link_regexes:
            regex_pos = last_pos
            while True:
                match = regex.search(text, pos=regex_pos)
                if match is None:
                    break
                host = match.group('host')
                for host_regex in avoid_hosts:
                    if host_regex.search(host):
                        # Avoided host; skip this match and search on.
                        regex_pos = match.end()
                        break
                else:
                    # No avoided host matched; accept this match.
                    break
            if match is None:
                continue
            if best_pos is None or match.start() < best_pos:
                best_match = match
                best_pos = match.start()
        if best_match is None:
            # No more matches; the remaining text trails the last link.
            if links:
                links[-1].tail = text
            else:
                leading_text = text
            break
        link = best_match.group(0)
        end = best_match.end()
        if link.endswith('.') or link.endswith(','):
            # Trailing punctuation should not become part of the link.
            end -= 1
            link = link[:-1]
        prev_text = text[:best_match.start()]
        if links:
            links[-1].tail = prev_text
        else:
            leading_text = prev_text
        anchor = factory('a')
        # The matched URL becomes the link target.
        anchor.set('href', link)
        body = best_match.group('body')
        if not body:
            body = link
        if body.endswith('.') or body.endswith(','):
            body = body[:-1]
        anchor.text = body
        links.append(anchor)
        text = text[end:]
    return leading_text, links


def autolink_html(html, *args, **kw):
    result_type = type(html)
    if isinstance(html, basestring):
        doc = fromstring(html)
    else:
        doc = copy.deepcopy(html)
    autolink(doc, *args, **kw)
    return _transform_result(result_type, doc)

autolink_html.__doc__ = autolink.__doc__
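
# Usage sketch (illustrative; the markup below is made up):
#
#     html = '<p>Docs at http://lxml.de/lxmlhtml.html - write to mailto:someone@lxml.de</p>'
#     print(autolink_html(html))
#
# Text inside _avoid_elements and URLs whose host matches _avoid_hosts
# (localhost, example.com/org/net, 127.0.0.1) are left untouched.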

_avoid_word_break_elements = ['pre', 'textarea', 'code']
_avoid_word_break_classes = ['nobreak']


def word_break(el, max_width=40,
               avoid_elements=_avoid_word_break_elements,
               avoid_classes=_avoid_word_break_classes,
               break_character=unichr(0x200b)):
    """
    Break long words in the text of this element (and its descendants) by
    inserting ``break_character`` (a zero-width space, U+200B, by default)
    so that browsers can wrap them.
    """
    if el.tag in avoid_elements:
        return
    class_name = el.get('class')
    if class_name:
        dont_break = False
        class_name = class_name.split()
        for avoid in avoid_classes:
            if avoid in class_name:
                dont_break = True
                break
        if dont_break:
            return
    if el.text:
        el.text = _break_text(el.text, max_width, break_character)
    for child in el:
        word_break(child, max_width=max_width,
                   avoid_elements=avoid_elements,
                   avoid_classes=avoid_classes,
                   break_character=break_character)
        if child.tail:
            child.tail = _break_text(child.tail, max_width, break_character)


def word_break_html(html, *args, **kw):
    result_type = type(html)
    doc = fromstring(html)
    word_break(doc, *args, **kw)
    return _transform_result(result_type, doc)
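
# Usage sketch (illustrative input): words longer than ``max_width`` get the
# break character (a zero-width space by default) inserted so that browsers
# can wrap them:
#
#     print(word_break_html('<p>supercalifragilisticexpialidocious</p>', max_width=10))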


def _break_text(text, max_width, break_character):
    words = text.split()
    for word in words:
        if len(word) > max_width:
            replacement = _insert_break(word, max_width, break_character)
            text = text.replace(word, replacement)
    return text

_break_prefer_re = re.compile(r'[^a-z]', re.I)


def _insert_break(word, width, break_character):
    result = ''
    while len(word) > width:
        start = word[:width]
        breaks = list(_break_prefer_re.finditer(start))
        if breaks:
            last_break = breaks[-1]
            # Prefer to break just after a non-letter character, but only if
            # one occurs within the last 10 characters of the window.
            if last_break.end() > width - 10:
                start = word[:last_break.end()]
        result += start + break_character
        word = word[len(start):]
    result += word
    return result